Prepare county level data

Overview of time windows

US prevalence: 03/09 - 04/18 US socdist: 03/01 - 05/03

UK prevalence: 03/09 - 04/10 UK socdist: 03/01 - 03/31

GER prevalence: 01/01 - 04/25 GER socdist: 02/25 - 04/27

Read and format prevalence data


df_us_covid <- read_csv('timeseries_usa_county_march1_april_09.csv')
df_us_covid$time %>% max()
[1] 40
df_us_covid <- df_us_covid %>% 
  filter(time <=31) %>% 
  arrange(countyfips) %>%
  mutate(stabil = -stabil) %>%
  dplyr::rename(county_fips = countyfips,
         pers_o = open, 
         pers_c = sci,
         pers_e = extra,
         pers_a = agree,
         pers_n = stabil)


df_us_covid <- df_us_covid %>% 
  dplyr::select(county_fips, time, mark, rate_day, pers_o,
                pers_c, pers_e, pers_a, pers_n)


df_us_covid %>% head()

Conty level controls


df_us_ctrl <- read.csv('controls_US.csv')

df_us_ctrl <- df_us_ctrl %>% select(-county_name) %>% 
  rename(county_fips = county)

df_us_ctrl %>% head()
NA

Social distancing data unacast


df_us_socdist <- read_csv('0409_sds-full-county.csv')

# create sequence of dates
date_sequence <- seq.Date(as.Date('2020-03-09'),
                          as.Date('2020-03-31'), 1)
                     

# create data frame with time sequence
df_dates = tibble(date_sequence, 1:length(date_sequence)) 
names(df_dates) <- c('date', 'time')

# merge day index with gps data
df_us_socdist = df_us_socdist %>% 
  merge(df_dates, by='date') %>% 
  arrange(county_fips) %>%
  as_tibble()

df_us_socdist %>% head()

Social distancing data FB


fb_files <- list.files('../FB Data/US individual files/Mobility/',
                       '*.csv', full.names = T)

df_us_socdist_fb <- fb_files %>% 
  map(read_csv) %>% bind_rows()

df_us_socdist_fb$ds %>% summary()
        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"2020-03-01" "2020-03-16" "2020-04-01" "2020-04-01" "2020-04-17" "2020-05-03" 
df_us_socdist_fb <- df_us_socdist_fb %>%
  select(-age_bracket, -gender, -baseline_name, -baseline_type) %>%
  rename(date = ds,
         county_fips = polygon_id,
         county_name = polygon_name,
         socdist_tiles = all_day_bing_tiles_visited_relative_change,
         socdist_single_tile = all_day_ratio_single_tile_users)

df_us_socdist_fb <- df_us_socdist_fb %>%
  filter(date >= '2020-03-09' & date <= '2020-03-31') %>%
  group_by(county_fips) %>% 
  arrange(date) %>% 
  mutate(time = row_number()) %>%
  ungroup() %>% 
  arrange(county_fips)

head(df_us_socdist_fb)

Sanity check socdist data

socdist <- df_us_socdist %>% merge(df_us_socdist_fb, by = c("county_fips", "time")) 

socdist[c('daily_distance_diff', 'daily_visitation_diff', 'socdist_tiles', 'socdist_single_tile')] %>% 
  cor(use = 'pairwise.complete')
                      daily_distance_diff daily_visitation_diff socdist_tiles socdist_single_tile
daily_distance_diff             1.0000000             0.1361318     0.3061683          -0.2746350
daily_visitation_diff           0.1361318             1.0000000     0.3826102          -0.3624062
socdist_tiles                   0.3061683             0.3826102     1.0000000          -0.7544123
socdist_single_tile            -0.2746350            -0.3624062    -0.7544123           1.0000000

Merge data


df_us <- plyr::join_all(list(df_us_covid, df_us_socdist_fb),
                  by = c('county_fips', 'time'), 
                  type = 'inner') %>% 
  plyr::join(df_us_ctrl, by='county_fips') %>% 
  arrange(county_fips, time)

# keep only counties with full data
fips_complete <- df_us %>% 
  group_by(county_fips) %>% 
  summarize(n = n()) %>% 
  filter(n==max(.$n)) %>% 
  .$county_fips

df_us <- df_us %>% filter(county_fips %in% fips_complete)

Explore data

Plot distributions


# distribution of observations per county
df_us %>% group_by(county_fips) %>% 
  summarise(mark = mean(mark)) %>% 
  ggplot(aes(x=mark)) + 
  geom_histogram(color="black", fill="white", binwidth = 300) +
  ggtitle('Distribution of observations per county')


  
# distributions of mean prevalence rates per county
df_us %>% group_by(county_fips) %>% 
  summarise(rate_day = mean(rate_day)) %>%
  ggplot(aes(x=rate_day)) + 
  geom_histogram(color="black", fill="white", binwidth = 0.01) +
  ggtitle('Distribution of mean prevalence rates by county')


  
# distribution of mean sd distance measue
df_us %>% group_by(county_fips) %>% 
  summarise(socdist_tiles = mean(socdist_tiles)) %>%
  ggplot(aes(x=socdist_tiles)) + 
  geom_histogram(color="black", fill="white", bins = 200) +
 ggtitle('Distribution of mean tiles visited measure by county')



# distribution of mean sd visit measue
df_us %>% group_by(county_fips) %>% 
  summarise(socdist_single_tile = mean(socdist_single_tile)) %>%
  ggplot(aes(x=socdist_single_tile)) + 
  geom_histogram(color="black", fill="white", bins = 200) +
  ggtitle('Distribution of mean single tile measute by county')

NA
NA

Plot prevalence over time


df_us %>% sample_n(20000) %>%
  ggplot(aes(x=time, y=rate_day)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  theme(legend.position="none") +
  ggtitle("Overall prevalence over time")



pers <- c('pers_o', 'pers_c', 'pers_e', 'pers_a', 'pers_n')

for (i in pers){

gg <- df_us %>% mutate(prev_tail = cut(.[[i]], 
                                       breaks = c(-Inf, quantile(.[[i]], 0.05), quantile(.[[i]], 0.95), Inf),
                                       labels = c('lower tail', 'center', 'upper tail'))) %>% 
  filter(prev_tail != 'center') %>%
  ggplot(aes(x=time, y=rate_day)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  facet_wrap(~prev_tail) + 
  theme(legend.position="none") +
  ggtitle(i)

print(gg)
}

Plot social distancing single tile visited


df_us %>% sample_n(10000) %>%
  ggplot(aes(x=time, y=socdist_single_tile)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  theme(legend.position="none") +
  ggtitle("Overall social distancing (single tile) over time")


pers <- c('pers_o', 'pers_c', 'pers_e', 'pers_a', 'pers_n')

for (i in pers){

gg <- df_us %>% mutate(dist_tail = cut(.[[i]], 
                                       breaks = c(-Inf, quantile(.[[i]], 0.05), quantile(.[[i]], 0.95), Inf),
                                       labels = c('lower tail', 'center', 'upper tail'))) %>% 
  filter(dist_tail != 'center') %>%
  ggplot(aes(x=time, y=socdist_single_tile)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  facet_wrap(~dist_tail) + 
  theme(legend.position="none") +
  ggtitle(i)

print(gg)
}

Control for weekend effect


df_us %>% sample_n(10000) %>%
  ggplot(aes(x=time, y=socdist_single_tile_clean)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  theme(legend.position="none") +
  ggtitle("Overall social distancing (single tile) over time")


pers <- c('pers_o', 'pers_c', 'pers_e', 'pers_a', 'pers_n')

for (i in pers){

gg <- df_us %>% mutate(dist_tail = cut(.[[i]], 
                                       breaks = c(-Inf, quantile(.[[i]], 0.05), quantile(.[[i]], 0.95), Inf),
                                       labels = c('lower tail', 'center', 'upper tail'))) %>% 
  filter(dist_tail != 'center') %>%
  ggplot(aes(x=time, y=socdist_single_tile_clean)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  facet_wrap(~dist_tail) + 
  theme(legend.position="none") +
  ggtitle(i)

print(gg)
}

Variance over time

Correlations


df_us %>% select(-time, -date, -county_name) %>% 
  group_by(county_fips) %>%
  summarize_if(is.numeric, mean) %>% 
  select(-county_fips) %>%
  cor(use='pairwise.complete.obs') %>% 
  round(3)
                      mark rate_day pers_o pers_c pers_e pers_a pers_n socdist_tiles socdist_single_tile
mark                 1.000    0.189  0.279 -0.052  0.097 -0.041 -0.174        -0.377               0.214
rate_day             0.189    1.000  0.196 -0.049  0.044 -0.037 -0.094        -0.240               0.167
pers_o               0.279    0.196  1.000 -0.052 -0.086 -0.154 -0.228        -0.249               0.208
pers_c              -0.052   -0.049 -0.052  1.000  0.148  0.650 -0.402         0.165              -0.258
pers_e               0.097    0.044 -0.086  0.148  1.000  0.235 -0.386        -0.065              -0.061
pers_a              -0.041   -0.037 -0.154  0.650  0.235  1.000 -0.384         0.123              -0.277
pers_n              -0.174   -0.094 -0.228 -0.402 -0.386 -0.384  1.000         0.062               0.190
socdist_tiles       -0.377   -0.240 -0.249  0.165 -0.065  0.123  0.062         1.000              -0.595
socdist_single_tile  0.214    0.167  0.208 -0.258 -0.061 -0.277  0.190        -0.595               1.000
airport_distance    -0.212   -0.055 -0.093 -0.094 -0.109 -0.103  0.040         0.258              -0.135
republican          -0.345   -0.234 -0.349 -0.046 -0.077 -0.086  0.307         0.350              -0.231
medage              -0.223   -0.075 -0.034 -0.066 -0.092 -0.074  0.232         0.013               0.301
male                -0.115   -0.052 -0.117 -0.096 -0.058 -0.154  0.054         0.126              -0.041
popdens              0.322    0.375  0.222 -0.044  0.028 -0.070 -0.044        -0.244               0.221
manufact            -0.162   -0.138 -0.386  0.070  0.034  0.119  0.179         0.057              -0.126
tourism              0.112    0.111  0.368  0.017 -0.003 -0.067 -0.188        -0.006               0.074
academics            0.418    0.284  0.462 -0.116  0.141 -0.145 -0.341        -0.434               0.259
medinc               0.307    0.228  0.226 -0.173  0.134 -0.219 -0.215        -0.444               0.225
physician_pc        -0.181   -0.100 -0.213  0.107 -0.047  0.112  0.117         0.163              -0.114
                    airport_distance republican medage   male popdens manufact tourism academics medinc
mark                          -0.212     -0.345 -0.223 -0.115   0.322   -0.162   0.112     0.418  0.307
rate_day                      -0.055     -0.234 -0.075 -0.052   0.375   -0.138   0.111     0.284  0.228
pers_o                        -0.093     -0.349 -0.034 -0.117   0.222   -0.386   0.368     0.462  0.226
pers_c                        -0.094     -0.046 -0.066 -0.096  -0.044    0.070   0.017    -0.116 -0.173
pers_e                        -0.109     -0.077 -0.092 -0.058   0.028    0.034  -0.003     0.141  0.134
pers_a                        -0.103     -0.086 -0.074 -0.154  -0.070    0.119  -0.067    -0.145 -0.219
pers_n                         0.040      0.307  0.232  0.054  -0.044    0.179  -0.188    -0.341 -0.215
socdist_tiles                  0.258      0.350  0.013  0.126  -0.244    0.057  -0.006    -0.434 -0.444
socdist_single_tile           -0.135     -0.231  0.301 -0.041   0.221   -0.126   0.074     0.259  0.225
airport_distance               1.000      0.121  0.029  0.194  -0.144   -0.138   0.101    -0.133 -0.177
republican                     0.121      1.000  0.134  0.162  -0.264    0.172  -0.220    -0.452 -0.192
medage                         0.029      0.134  1.000 -0.040  -0.105    0.091  -0.080    -0.210 -0.107
male                           0.194      0.162 -0.040  1.000  -0.101   -0.080  -0.046    -0.175 -0.004
popdens                       -0.144     -0.264 -0.105 -0.101   1.000   -0.120   0.049     0.242  0.154
manufact                      -0.138      0.172  0.091 -0.080  -0.120    1.000  -0.380    -0.385 -0.179
tourism                        0.101     -0.220 -0.080 -0.046   0.049   -0.380   1.000     0.279 -0.022
academics                     -0.133     -0.452 -0.210 -0.175   0.242   -0.385   0.279     1.000  0.719
medinc                        -0.177     -0.192 -0.107 -0.004   0.154   -0.179  -0.022     0.719  1.000
physician_pc                  -0.038      0.196  0.078  0.159  -0.079    0.157  -0.226    -0.369 -0.202
                    physician_pc
mark                      -0.181
rate_day                  -0.100
pers_o                    -0.213
pers_c                     0.107
pers_e                    -0.047
pers_a                     0.112
pers_n                     0.117
socdist_tiles              0.163
socdist_single_tile       -0.114
airport_distance          -0.038
republican                 0.196
medage                     0.078
male                       0.159
popdens                   -0.079
manufact                   0.157
tourism                   -0.226
academics                 -0.369
medinc                    -0.202
physician_pc               1.000
  

Model building

Prepare functions


# function calculates all relevant models
run_models <- function(y, lvl1_x, lvl2_x, lvl2_id, data, ctrls=F){

  # subset data
  data = data %>% 
    dplyr::select(all_of(y), all_of(lvl1_x), all_of(lvl2_x), all_of(lvl2_id), 
                  popdens, rate_day, all_of(y))
  data = data %>% 
    dplyr::rename(y = all_of(y),
           lvl1_x = all_of(lvl1_x),
           lvl2_x = all_of(lvl2_x),
           lvl2_id = all_of(lvl2_id)
           )
  
  # configure optimization procedure
  ctrl_config <- lmeControl(opt = 'optim', maxIter = 100, msMaxIter = 100)

  # baseline
  baseline <- lme(fixed = y ~ 1, random = ~ 1 | lvl2_id, 
                    data = data,
                    correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')

  # random intercept fixed slope
  random_intercept <- lme(fixed = y ~ lvl1_x + lvl2_x, 
                          random = ~ 1 | lvl2_id,
                            data = data,
                            correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')

  # random intercept random slope
  random_slope <- lme(fixed = y ~ lvl1_x + lvl2_x, 
                      random = ~ lvl1_x | lvl2_id, 
                        data = data,
                        correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')

  # cross level interaction
  interaction <- lme(fixed = y ~ lvl1_x * lvl2_x, 
                     random = ~ lvl1_x | lvl2_id, 
                       data = data,
                       correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')
  
  # create list with results
  results <- list('baseline' = baseline, 
                  "random_intercept" = random_intercept, 
                  "random_slope" = random_slope,
                  "interaction" = interaction)
  
  
  if (ctrls == 'dem' | ctrls == 'prev'){
    
    # random intercept random slope
    random_slope_ctrl_dem <- lme(fixed = y ~ lvl1_x + lvl2_x + popdens,
                              random = ~ lvl1_x | lvl2_id, 
                          data = data,
                          correlation = corAR1(),
                    control = ctrl_config,
                  method = 'ML')
  
    # cross level interaction
    interaction_ctrl_main_dem <- lme(fixed = y ~ lvl1_x * lvl2_x + popdens,
                             random = ~ lvl1_x | lvl2_id, 
                         data = data,
                         correlation = corAR1(),
                    control = ctrl_config,
                  method = 'ML')
  
    # cross level interaction
    interaction_ctrl_int_dem <- lme(fixed = y ~ lvl1_x * lvl2_x + lvl1_x * popdens,
                             random = ~ lvl1_x | lvl2_id, 
                         data = data,
                         correlation = corAR1(),
                    control = ctrl_config,
                  method = 'ML')        
    
    # create list with results
    results <- list('baseline' = baseline, 
                    "random_intercept" = random_intercept, 
                    "random_slope" = random_slope,
                    "interaction" = interaction,
                    "random_slope_ctrl_dem" = random_slope_ctrl_dem,
                    "interaction_ctrl_main_dem" = interaction_ctrl_main_dem,
                    "interaction_ctrl_int_dem" = interaction_ctrl_int_dem)
  }
  
  if (ctrls == 'prev'){
  
    # random intercept random slope
    random_slope_ctrl_prev <- lme(fixed = y ~ lvl1_x + lvl2_x + popdens + rate_day,
                              random = ~ lvl1_x + rate_day | lvl2_id, 
                          data = data,
                          correlation = corAR1(),
                          control = ctrl_config,
                  method = 'ML')  
    
        # cross level interaction
    interaction_ctrl_main_prev <- lme(fixed = y ~ lvl1_x * lvl2_x + popdens + rate_day,
                             random = ~ lvl1_x | lvl2_id, 
                         data = data,
                         correlation = corAR1(),
                    control = ctrl_config,
                  method = 'ML')
  
  
    # cross level interaction
    interaction_ctrl_int_prev<- lme(fixed = y ~ lvl1_x * lvl2_x + lvl1_x * popdens + rate_day,
                             random = ~ lvl1_x + rate_day | lvl2_id, 
                         data = data,
                         correlation = corAR1(),
                          control = ctrl_config,
                  method = 'ML')
  
    # create list with results
    results <- list('baseline' = baseline, 
                    "random_intercept" = random_intercept, 
                    "random_slope" = random_slope,
                    "interaction" = interaction,
                    "random_slope_ctrl_dem" = random_slope_ctrl_dem,
                    "interaction_ctrl_main_dem" = interaction_ctrl_main_dem,
                    "interaction_ctrl_int_dem" = interaction_ctrl_int_dem,                    
                    "random_slope_ctrl_prev" = random_slope_ctrl_prev,
                    "interaction_ctrl_main_prev" = interaction_ctrl_main_prev,
                    "interaction_ctrl_int_prev" = interaction_ctrl_int_prev)
  }
  
  if(ctrls == 'exp'){
    # random intercept random slope
  random_slope_exp <- lme(fixed = y ~ (lvl1_x + I(lvl1_x^2)) + lvl2_x, 
                      random = ~ (lvl1_x + I(lvl1_x^2)) | lvl2_id, 
                        data = data,
                        correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')

  # cross level interaction
  interaction_exp <- lme(fixed = y ~ (lvl1_x + I(lvl1_x^2)) * lvl2_x, 
                     random = ~ (lvl1_x + I(lvl1_x^2)) | lvl2_id, 
                       data = data,
                       correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')  
  
  
  # create list with results
  results <- list('baseline' = baseline, 
                  "random_intercept" = random_intercept, 
                  "random_slope" = random_slope,
                  "interaction" = interaction,                  
                  "random_slope_exp" = random_slope_exp,
                  "interaction_exp" = interaction_exp)
  }
  
  return(results)
        
}

# extracts table with coefficients and tests statistics
extract_results <- function(models) {
  
  models_summary <- models %>% 
  map(summary) %>% 
  map("tTable") %>% 
  map(as.data.frame) %>% 
  map(round, 10) 
  # %>% map(~ .[str_detect(rownames(.), 'Inter|lvl'),])
  
  return(models_summary)
  
}


compare_models <- function(models) {

  mdl_names <- models %>% names()
  
  str = ''
  for (i in mdl_names){
    
    mdl_str <- paste('models$', i, sep = '')
    
    if(i == 'baseline'){
      str <- mdl_str
    }else{
    str <- paste(str, mdl_str, sep=', ')
    }
  }
  
  anova_str <- paste0('anova(', str, ')')
  mdl_comp <- eval(parse(text=anova_str))
  rownames(mdl_comp) = mdl_names
  return(mdl_comp)
}

Rescale Data

Predict prevalence

prevalence ~ openness


models_o_covid <-run_models(y = 'rate_day',
                         lvl1_x = 'time',
                         lvl2_x = 'pers_o',
                         lvl2_id = 'county_fips',
                         data = df_us_scaled,
                         ctrls = 'dem')

extract_results(models_o_covid)
$baseline

$random_intercept

$random_slope

$interaction

$random_slope_ctrl_dem

$interaction_ctrl_main_dem

$interaction_ctrl_int_dem
compare_models(models_o_covid)
NA

prevalence ~ conscientiousness


models_c_covid <-run_models(y = 'rate_day', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_c', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'dem')

extract_results(models_c_covid)
$baseline

$random_intercept

$random_slope

$interaction

$random_slope_ctrl_dem

$interaction_ctrl_main_dem

$interaction_ctrl_int_dem
compare_models(models_c_covid)
NA
NA

prevalence ~ extraversion


models_e_covid <-run_models(y = 'rate_day', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_e', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'dem')

extract_results(models_e_covid)
$baseline

$random_intercept

$random_slope

$interaction

$random_slope_ctrl_dem

$interaction_ctrl_main_dem

$interaction_ctrl_int_dem
compare_models(models_e_covid)
NA
NA

prevalence ~ agreeableness


models_a_covid <-run_models(y = 'rate_day', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_a', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'dem')

extract_results(models_a_covid)
$baseline

$random_intercept

$random_slope

$interaction

$random_slope_ctrl_dem

$interaction_ctrl_main_dem

$interaction_ctrl_int_dem
compare_models(models_a_covid)
NA
NA

prevalence ~ neuroticism


models_n_covid <-run_models(y = 'rate_day', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_n', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'dem')

extract_results(models_n_covid)
$baseline

$random_intercept

$random_slope

$interaction

$random_slope_ctrl_dem

$interaction_ctrl_main_dem

$interaction_ctrl_int_dem
compare_models(models_n_covid)
NA
NA

Predict social distancing

social distancing ~ openness


models_o_sd <-run_models(y = 'socdist_single_tile', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_o', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'prev')

extract_results(models_o_sd)
$baseline

$random_intercept

$random_slope

$interaction

$random_slope_ctrl_dem

$interaction_ctrl_main_dem

$interaction_ctrl_int_dem

$random_slope_ctrl_prev

$interaction_ctrl_main_prev

$interaction_ctrl_int_prev
compare_models(models_o_sd)
NA
NA

social distancing ~ conscientiousness


models_c_sd <-run_models(y = 'socdist_single_tile', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_c', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'prev')

extract_results(models_c_sd)
$baseline

$random_intercept

$random_slope

$interaction

$random_slope_ctrl_dem

$interaction_ctrl_main_dem

$interaction_ctrl_int_dem

$random_slope_ctrl_prev

$interaction_ctrl_main_prev

$interaction_ctrl_int_prev
compare_models(models_c_sd)
NA
NA

social distancing ~ extraversion


models_e_sd <-run_models(y = 'socdist_single_tile', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_e', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'prev')

extract_results(models_e_sd)
$baseline

$random_intercept

$random_slope

$interaction

$random_slope_ctrl_dem

$interaction_ctrl_main_dem

$interaction_ctrl_int_dem

$random_slope_ctrl_prev

$interaction_ctrl_main_prev

$interaction_ctrl_int_prev
compare_models(models_e_sd)
NA
NA

social distancing ~ agreeableness


models_a_sd <-run_models(y = 'socdist_single_tile', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_a', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'prev')

extract_results(models_a_sd)
$baseline

$random_intercept

$random_slope

$interaction

$random_slope_ctrl_dem

$interaction_ctrl_main_dem

$interaction_ctrl_int_dem

$random_slope_ctrl_prev

$interaction_ctrl_main_prev

$interaction_ctrl_int_prev
compare_models(models_a_sd)
NA
NA

social distancing ~ neuroticism


models_n_sd <-run_models(y = 'socdist_single_tile', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_n', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'prev')

extract_results(models_n_sd)
$baseline

$random_intercept

$random_slope

$interaction

$random_slope_ctrl_dem

$interaction_ctrl_main_dem

$interaction_ctrl_int_dem

$random_slope_ctrl_prev

$interaction_ctrl_main_prev

$interaction_ctrl_int_prev
compare_models(models_n_sd)
NA

Create overview table

Define function to create overview tables


summary_table <- function(models, dv_name, prev=F){

  temp_df_ctrl_main <- NULL
  temp_df_ctrl_int <- NULL
  temp_df_ctrl_int_prev <- NULL
  
  for (i in models){
    results <- i %>% extract_results()
    
    results_ctrl_main <- results$interaction_ctrl_main_dem['lvl1_x:lvl2_x',]
    temp_df_ctrl_main <- temp_df_ctrl_main %>% rbind(results_ctrl_main)
    
    results_ctrl_int <- results$interaction_ctrl_int_dem['lvl1_x:lvl2_x',]
    temp_df_ctrl_int <- temp_df_ctrl_int %>% rbind(results_ctrl_int)
    
    if(prev){
      results_ctrl_int_prev <- results$interaction_ctrl_int_prev['lvl1_x:lvl2_x',]
      temp_df_ctrl_int_prev <- temp_df_ctrl_int_prev %>% rbind(results_ctrl_int_prev)
    }
        
  }
  
  names_ctrl_main <- paste0(dv_name, '~', c('o', 'c', 'e', 'a', 'n'), '*time', '_crtl_popdens')
  rownames(temp_df_ctrl_main) <- names_ctrl_main

  names_ctrl_int <- paste0(dv_name, '~', c('o', 'c', 'e', 'a', 'n'), '*time', '_crtl_popdens*time')
  rownames(temp_df_ctrl_int) <- names_ctrl_int

  if(prev){
    names_ctrl_int_prev <- paste0(dv_name, '~', c('o', 'c', 'e', 'a', 'n'), '*time', '_crtl_popdens*time_prev')
    rownames(temp_df_ctrl_int_prev) <- names_ctrl_int_prev
    
    sum_tab <- rbind(temp_df_ctrl_main, temp_df_ctrl_int, temp_df_ctrl_int_prev) %>% round(4)
  }else{
    sum_tab <- rbind(temp_df_ctrl_main, temp_df_ctrl_int) %>% round(4)
  }


  
  return(sum_tab)

} 

Create overview tables

# prevalence
models_prev <- list(models_o_covid, 
                    models_c_covid, 
                    models_e_covid, 
                    models_a_covid, 
                    models_n_covid)

sum_tab_prev <- summary_table(models_prev, dv_name = 'prev')

write.table(sum_tab_prev, quote=F)
Value Std.Error DF t-value p-value
prev~o*time_crtl_popdens 0.0244 0.0027 53135 9.1918 0
prev~c*time_crtl_popdens -0.0061 0.0027 53135 -2.266 0.0235
prev~e*time_crtl_popdens 0.0055 0.0027 53135 2.0302 0.0423
prev~a*time_crtl_popdens -0.0042 0.0027 53135 -1.572 0.116
prev~n*time_crtl_popdens -0.0109 0.0027 53135 -4.0489 1e-04
prev~o*time_crtl_popdens*time 0.013 0.0025 53134 5.1452 0
prev~c*time_crtl_popdens*time -0.0037 0.0025 53134 -1.5004 0.1335
prev~e*time_crtl_popdens*time 0.004 0.0025 53134 1.6065 0.1082
prev~a*time_crtl_popdens*time -4e-04 0.0025 53134 -0.179 0.858
prev~n*time_crtl_popdens*time -0.0085 0.0025 53134 -3.4502 6e-04
# social distancing
models_socdist <- list(models_o_sd, 
                       models_c_sd, 
                       models_e_sd, 
                       models_a_sd, 
                       models_n_sd)

sum_tab_socdist <- summary_table(models_socdist, dv_name = 'socdist', prev=T)

write.table(sum_tab_socdist, quote=F)
Value Std.Error DF t-value p-value
socdist~o*time_crtl_popdens 0.012 7e-04 53135 18.0098 0
socdist~c*time_crtl_popdens -0.006 7e-04 53135 -8.6385 0
socdist~e*time_crtl_popdens 0.0025 7e-04 53135 3.5181 4e-04
socdist~a*time_crtl_popdens -0.0067 7e-04 53135 -9.7575 0
socdist~n*time_crtl_popdens -0.0033 7e-04 53135 -4.5664 0
socdist~o*time_crtl_popdens*time 0.0099 7e-04 53134 14.8225 0
socdist~c*time_crtl_popdens*time -0.0055 7e-04 53134 -8.2817 0
socdist~e*time_crtl_popdens*time 0.0021 7e-04 53134 3.2025 0.0014
socdist~a*time_crtl_popdens*time -0.006 7e-04 53134 -9.0111 0
socdist~n*time_crtl_popdens*time -0.0028 7e-04 53134 -4.0934 0
socdist~o*time_crtl_popdens*time_prev 0.0102 7e-04 53133 15.0292 0
socdist~c*time_crtl_popdens*time_prev -0.0054 7e-04 53133 -8.1668 0
socdist~e*time_crtl_popdens*time_prev 0.0022 7e-04 53133 3.2724 0.0011
socdist~a*time_crtl_popdens*time_prev -0.0059 7e-04 53133 -8.8441 0
socdist~n*time_crtl_popdens*time_prev -0.0031 7e-04 53133 -4.5123 0

Conditional random forest analysis

Extract slopes


# slope prevalence
df_us_slope_prev <- df_us_scaled %>% split(.$county) %>% 
  map(~ lm(rate_day ~ time, data = .)) %>%
  map(coef) %>% 
  map_dbl('time') %>% 
  as.data.frame() %>% 
  rownames_to_column('county_fips') %>% 
  rename(slope_prev = '.')

df_us_slope_prev <- df_us_scaled %>% select(-time, -rate_day, -socdist_single_tile) %>%
  distinct() %>% 
  mutate(county_fips = as.character(county_fips)) %>%
  inner_join(df_us_slope_prev, by = 'county_fips') %>%
  drop_na()



# slope social distancing
df_us_slope_socdist <- df_us_scaled %>% split(.$county) %>% 
  map(~ lm(socdist_single_tile ~ time, data = .)) %>%
  map(coef) %>% 
  map_dbl('time') %>% 
  as.data.frame() %>% 
  rownames_to_column('county_fips') %>% 
  rename(slope_socdist = '.')

df_us_slope_socdist <- df_us_scaled %>% select(-time, -rate_day, -socdist_single_tile) %>%
  distinct() %>% 
  mutate(county_fips = as.character(county_fips)) %>%
  inner_join(df_us_slope_socdist, by = 'county_fips') %>%
  drop_na()

df_us_slope_socdist

Explore distribution of slopes

df_us_slope_prev %>% ggplot(aes(slope_prev)) + geom_histogram(bins = 100)


df_us_slope_socdist %>% ggplot(aes(slope_socdist)) + geom_histogram(bins = 100)

CRF prevalence ~ openness


ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_o_fit_prev <- cforest(slope_prev ~ pers_o + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_prev[-1], 
                         controls = ctrls)

crf_o_varimp_prev <- varimp(crf_o_fit_prev, nperm = 1)
crf_o_varimp_cond_prev <- varimp(crf_o_fit_prev, conditional = T, nperm = 1)

crf_o_varimp_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))


crf_o_varimp_cond_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') +
  theme(axis.text.x = element_text(angle = 90))

CRF prevalence ~ conscientiousness


ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_c_fit_prev <- cforest(slope_prev ~ pers_c + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_prev[-1], 
                         controls = ctrls)

crf_c_varimp_prev <- varimp(crf_c_fit_prev, nperm = 1)
crf_c_varimp_cond_prev <- varimp(crf_c_fit_prev, conditional = T, nperm = 1)

crf_c_varimp_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))


crf_c_varimp_cond_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

CRF prevalence ~ extraversion


ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_e_fit_prev <- cforest(slope_prev ~ pers_e + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_prev[-1], 
                         controls = ctrls)

crf_e_varimp_prev <- varimp(crf_e_fit_prev, nperm = 1)
crf_e_varimp_cond_prev <- varimp(crf_e_fit_prev, conditional = T, nperm = 1)

crf_e_varimp_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))


crf_e_varimp_cond_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

CRF prevalence ~ agreeableness


ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_a_fit_prev <- cforest(slope_prev ~ pers_a + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_prev[-1], 
                         controls = ctrls)

crf_a_varimp_prev <- varimp(crf_a_fit_prev, nperm = 1)
crf_a_varimp_cond_prev <- varimp(crf_a_fit_prev, conditional = T, nperm = 1)

crf_a_varimp_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))


crf_a_varimp_cond_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

CRF prevalence ~ neuroticism


ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_n_fit_prev <- cforest(slope_prev ~ pers_n + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_prev[-1], 
                         controls = ctrls)

crf_n_varimp_prev <- varimp(crf_n_fit_prev, nperm = 1)
crf_n_varimp_cond_prev <- varimp(crf_n_fit_prev, conditional = T, nperm = 1)

crf_n_varimp_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))


crf_n_varimp_cond_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

CRF social distancing ~ openness


ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_o_fit_socdist <- cforest(slope_socdist ~ pers_o + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_socdist[-1], 
                         controls = ctrls)

crf_o_varimp_socdist <- varimp(crf_o_fit_socdist, nperm = 1)
crf_o_varimp_cond_socdist <- varimp(crf_o_fit_socdist, conditional = T, nperm = 1)

crf_o_varimp_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))


crf_o_varimp_cond_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

CRF social distancing ~ conscientiousness


ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_c_fit_socdist <- cforest(slope_socdist ~ pers_c + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_socdist[-1], 
                         controls = ctrls)

crf_c_varimp_socdist <- varimp(crf_c_fit_socdist, nperm = 1)
crf_c_varimp_cond_socdist <- varimp(crf_c_fit_socdist, conditional = T, nperm = 1)

crf_c_varimp_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))


crf_c_varimp_cond_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

CRF social distancing ~ extraversion


ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_e_fit_socdist <- cforest(slope_socdist ~ pers_e + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_socdist[-1], 
                         controls = ctrls)

crf_e_varimp_socdist <- varimp(crf_e_fit_socdist, nperm = 1)
crf_e_varimp_cond_socdist <- varimp(crf_e_fit_socdist, conditional = T, nperm = 1)

crf_e_varimp_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))


crf_e_varimp_cond_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

CRF social distancing ~ agreeableness


ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_a_fit_socdist <- cforest(slope_socdist ~ pers_a + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_socdist[-1], 
                         controls = ctrls)

crf_a_varimp_socdist <- varimp(crf_a_fit_socdist, nperm = 1)
crf_a_varimp_cond_socdist <- varimp(crf_a_fit_socdist, conditional = T, nperm = 1)

crf_a_varimp_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))


crf_a_varimp_cond_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

CRF social distancing ~ neuroticism


ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_n_fit_socdist <- cforest(slope_socdist ~ pers_n + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_socdist[-1], 
                         controls = ctrls)

crf_n_varimp_socdist <- varimp(crf_n_fit_socdist, nperm = 1)
crf_n_varimp_cond_socdist <- varimp(crf_n_fit_socdist, conditional = T, nperm = 1)

crf_n_varimp_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))


crf_n_varimp_cond_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

Linear models predicting slopes from personality


lm_slope_prev_pers <- lm(slope_prev ~ pers_o + pers_c + pers_e + pers_a + pers_n, 
                         data = df_us_slope_prev)
lm_slope_prev_pers %>% summary()

Call:
lm(formula = slope_prev ~ pers_o + pers_c + pers_e + pers_a + 
    pers_n, data = df_us_slope_prev)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.11019 -0.02930 -0.01466  0.00224  1.64943 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.027854   0.001886  14.769  < 2e-16 ***
pers_o       0.017433   0.002045   8.525  < 2e-16 ***
pers_c      -0.006911   0.002553  -2.707  0.00683 ** 
pers_e       0.004472   0.002094   2.135  0.03282 *  
pers_a       0.000545   0.002601   0.210  0.83404    
pers_n      -0.005611   0.002374  -2.363  0.01819 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.09289 on 2420 degrees of freedom
Multiple R-squared:  0.04663,   Adjusted R-squared:  0.04466 
F-statistic: 23.67 on 5 and 2420 DF,  p-value: < 2.2e-16
lm_slope_socdist_pers <- lm(slope_socdist ~ pers_o + pers_c + pers_e + pers_a + pers_n, 
                            data = df_us_slope_socdist)
lm_slope_socdist_pers %>% summary()

Call:
lm(formula = slope_socdist ~ pers_o + pers_c + pers_e + pers_a + 
    pers_n, data = df_us_slope_socdist)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.28392 -0.02426 -0.00637  0.01813  1.65694 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.104604   0.001132  92.426  < 2e-16 ***
pers_o       0.008687   0.001227   7.079 1.89e-12 ***
pers_c      -0.004476   0.001532  -2.922 0.003512 ** 
pers_e       0.003246   0.001257   2.583 0.009841 ** 
pers_a      -0.003405   0.001561  -2.181 0.029245 *  
pers_n      -0.004956   0.001425  -3.479 0.000513 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.05574 on 2420 degrees of freedom
Multiple R-squared:  0.04961,   Adjusted R-squared:  0.04764 
F-statistic: 25.26 on 5 and 2420 DF,  p-value: < 2.2e-16

Linear models predicting slopes with controls


lm_slope_prev_pers <- lm(slope_prev ~ pers_o + pers_c + pers_e + pers_a + pers_n + 
                           airport_distance + republican + medage + male + popdens + 
                           manufact + tourism + academics + medinc + physician_pc,
                         data = df_us_slope_prev)
lm_slope_prev_pers %>% summary()

Call:
lm(formula = slope_prev ~ pers_o + pers_c + pers_e + pers_a + 
    pers_n + airport_distance + republican + medage + male + 
    popdens + manufact + tourism + academics + medinc + physician_pc, 
    data = df_us_slope_prev)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.31430 -0.02291 -0.00994  0.00277  1.59045 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       2.790e-02  1.713e-03  16.285  < 2e-16 ***
pers_o            2.761e-03  2.187e-03   1.263 0.206874    
pers_c           -3.284e-03  2.360e-03  -1.391 0.164245    
pers_e            1.138e-03  1.932e-03   0.589 0.555932    
pers_a            5.501e-03  2.476e-03   2.221 0.026416 *  
pers_n            2.122e-03  2.359e-03   0.900 0.368348    
airport_distance  2.754e-03  1.865e-03   1.477 0.139782    
republican       -7.602e-03  2.085e-03  -3.646 0.000272 ***
medage           -6.650e-05  1.807e-03  -0.037 0.970654    
male             -2.818e-05  1.876e-03  -0.015 0.988018    
popdens           3.369e-02  1.832e-03  18.395  < 2e-16 ***
manufact         -2.120e-03  2.059e-03  -1.030 0.303341    
tourism           5.024e-03  2.081e-03   2.414 0.015849 *  
academics         4.209e-03  3.415e-03   1.232 0.217913    
medinc            1.237e-02  2.824e-03   4.379 1.24e-05 ***
physician_pc      6.768e-04  1.889e-03   0.358 0.720186    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.08439 on 2410 degrees of freedom
Multiple R-squared:  0.2164,    Adjusted R-squared:  0.2115 
F-statistic: 44.37 on 15 and 2410 DF,  p-value: < 2.2e-16
lm_slope_socdist_pers <- lm(slope_socdist ~ pers_o + pers_c + pers_e + pers_a + pers_n + 
                              airport_distance + republican + medage + male + popdens + 
                              manufact + tourism + academics + medinc + physician_pc,
                            data = df_us_slope_socdist)
lm_slope_socdist_pers %>% summary()

Call:
lm(formula = slope_socdist ~ pers_o + pers_c + pers_e + pers_a + 
    pers_n + airport_distance + republican + medage + male + 
    popdens + manufact + tourism + academics + medinc + physician_pc, 
    data = df_us_slope_socdist)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.26752 -0.01730 -0.00324  0.01248  1.63123 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       0.1046597  0.0010362 101.000  < 2e-16 ***
pers_o           -0.0003449  0.0013227  -0.261 0.794320    
pers_c           -0.0010022  0.0014272  -0.702 0.482618    
pers_e            0.0004776  0.0011681   0.409 0.682670    
pers_a            0.0007837  0.0014976   0.523 0.600800    
pers_n            0.0023991  0.0014266   1.682 0.092753 .  
airport_distance -0.0037722  0.0011277  -3.345 0.000835 ***
republican       -0.0109897  0.0012610  -8.715  < 2e-16 ***
medage            0.0058495  0.0010931   5.351 9.55e-08 ***
male              0.0053010  0.0011348   4.671 3.16e-06 ***
popdens           0.0055870  0.0011077   5.044 4.91e-07 ***
manufact          0.0023425  0.0012451   1.881 0.060040 .  
tourism           0.0039623  0.0012585   3.148 0.001662 ** 
academics         0.0056901  0.0020653   2.755 0.005912 ** 
medinc            0.0131087  0.0017081   7.675 2.39e-14 ***
physician_pc     -0.0029367  0.0011424  -2.570 0.010215 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.05104 on 2410 degrees of freedom
Multiple R-squared:  0.2067,    Adjusted R-squared:  0.2017 
F-statistic: 41.85 on 15 and 2410 DF,  p-value: < 2.2e-16
gg <- gg + geom_map(data=us_map_shape, map=us_map_shape,
                    aes(x = x, y = y, map_id=fips),
                    color="black", fill="white", size=0.25)
Error: `map` must have the columns `x`, `y`, and `id`
df_us_covid %>% filter(county == 23013)

Change point analysis

library(changepoint)
Successfully loaded changepoint package version 2.2.2
 NOTE: Predefined penalty values changed in version 2.2.  Previous penalty values with a postfix 1 i.e. SIC1 are now without i.e. SIC and previous penalties without a postfix i.e. SIC are now with a postfix 0 i.e. SIC0. See NEWS and help files for further details.

Preparation

# keep only counties with full data
fips_complete <- df_us_scaled %>% 
  group_by(county_fips) %>% 
  summarize(n = n()) %>% 
  filter(n==max(.$n)) %>% 
  .$county_fips

Prevalence


# run changepoint analysis
df_us_prev_cpt_results <- df_us_scaled %>% select(county_fips, rate_day) %>%
  filter(county_fips %in% fips_complete) %>% 
  split(.$county_fips) %>%
  map(~ cpt.meanvar(as.vector(.$rate_day),
                    class=TRUE,
                    param.estimates=TRUE,
                    Q=1))

# calculate change point
df_us_prev_cpt_day <- df_us_prev_cpt_results %>% 
  map(cpts) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(cpt_day_prev = '.') %>%
  rownames_to_column('county_fips')

# calculate mean differences
df_us_prev_cpt_mean_diff <- df_us_prev_cpt_results %>% 
  map(param.est) %>% 
  map(~ .$mean) %>% 
  map(~ .[2]-.[1]) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(mean_diff_prev = '.') %>%
  rownames_to_column('county_fips')

# calculate varaince differences
df_us_prev_cpt_var_diff <- df_us_prev_cpt_results %>% 
  map(param.est) %>% 
  map(~ .$variance) %>% 
  map(~ .[2]-.[1]) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(var_diff_prev = '.') %>%
  rownames_to_column('county_fips')

# merge new variables 
df_us_cpt_prev <- df_us_scaled %>%
  select(-time, -rate_day, -socdist_single_tile, -socdist_single_tile_clean) %>%
  distinct() %>%
  mutate(county_fips = as.character(county_fips)) %>%
  left_join(df_us_prev_cpt_day, by='county_fips') %>%
  left_join(df_us_prev_cpt_mean_diff, by='county_fips') %>%
  left_join(df_us_prev_cpt_var_diff, by='county_fips')

df_us_cpt_prev %>% select(cpt_day_prev) %>% map(hist)
$cpt_day_prev
$breaks
 [1]  2  4  6  8 10 12 14 16 18 20 22

$counts
 [1]  10  32 148   4 189 674   9   7  10  78

$density
 [1] 0.004306632 0.013781223 0.063738157 0.001722653 0.081395349 0.290267011 0.003875969 0.003014643
 [9] 0.004306632 0.033591731

$mids
 [1]  3  5  7  9 11 13 15 17 19 21

$xname
[1] ".x[[i]]"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"

df_us_cpt_prev %>% select(mean_diff_prev) %>% map(hist)
$mean_diff_prev
$breaks
 [1]  0  2  4  6  8 10 12 14 16 18 20 22 24 26

$counts
 [1] 1106   34    7    5    2    3    3    0    0    0    0    0    1

$density
 [1] 0.4763135228 0.0146425495 0.0030146425 0.0021533161 0.0008613264 0.0012919897 0.0012919897
 [8] 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0004306632

$mids
 [1]  1  3  5  7  9 11 13 15 17 19 21 23 25

$xname
[1] ".x[[i]]"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"

df_us_cpt_prev %>% select(var_diff_prev) %>% map(hist)
$var_diff_prev
$breaks
 [1] -20   0  20  40  60  80 100 120 140 160 180 200 220 240 260 280 300

$counts
 [1]   59 1007    4    0    3    4    6    0    0    0    0    0    0    0    0    1

$density
 [1] 2.721402e-03 4.644834e-02 1.845018e-04 0.000000e+00 1.383764e-04 1.845018e-04 2.767528e-04
 [8] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
[15] 0.000000e+00 4.612546e-05

$mids
 [1] -10  10  30  50  70  90 110 130 150 170 190 210 230 250 270 290

$xname
[1] ".x[[i]]"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"

df_us_cpt_prev %>% dim()
[1] 2397   19
df_us_cpt_prev %>% drop_na() %>% dim()
[1] 1080   19

for(i in head(df_us_prev_cpt_results,18)){
  plot(i)
}

NA

Socdist


# run changepoint analysis
df_us_socdist_cpt_results <- df_us_scaled %>% select(county_fips, socdist_single_tile_clean) %>%
  filter(county_fips %in% fips_complete) %>% 
  split(.$county_fips) %>%
  map(~ cpt.meanvar(as.vector(.$socdist_single_tile_clean),
                    #penalty = 'Asymptotic',
                    class=TRUE,
                    param.estimates=TRUE,
                    Q=1,
                    test.stat = 'Normal'))

# calculate change point
df_us_socdist_cpt_day <- df_us_socdist_cpt_results %>% 
  map(cpts) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(cpt_day_socdist = '.') %>%
  rownames_to_column('county_fips')

# calculate mean differences
df_us_socdist_cpt_mean_diff <- df_us_socdist_cpt_results %>% 
  map(param.est) %>% 
  map(~ .$mean) %>% 
  map(~ .[2]-.[1]) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(mean_diff_socdist = '.') %>%
  rownames_to_column('county_fips')

# calculate varaince differences
df_us_socdist_cpt_var_diff <- df_us_socdist_cpt_results %>% 
  map(param.est) %>% 
  map(~ .$variance) %>% 
  map(~ .[2]-.[1]) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(var_diff_socdist = '.') %>%
  rownames_to_column('county_fips')

# merge new variables 
df_us_cpt_prev_socdist <- df_us_cpt_prev %>%
  left_join(df_us_socdist_cpt_day, by='county_fips') %>%
  left_join(df_us_socdist_cpt_mean_diff, by='county_fips') %>%
  left_join(df_us_socdist_cpt_var_diff, by='county_fips')

df_us_cpt_prev_socdist %>% select(cpt_day_socdist) %>% map(hist)
$cpt_day_socdist
$breaks
 [1]  2  4  6  8 10 12 14 16 18 20 22

$counts
 [1]  11 116 564 592 591  76 106  12   2 327

$density
 [1] 0.0022945348 0.0241969128 0.1176470588 0.1234876929 0.1232790989 0.0158531498 0.0221109720
 [8] 0.0025031289 0.0004171882 0.0682102628

$mids
 [1]  3  5  7  9 11 13 15 17 19 21

$xname
[1] ".x[[i]]"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"

df_us_cpt_prev_socdist %>% select(mean_diff_socdist) %>% map(hist)
$mean_diff_socdist
$breaks
 [1] -0.5  0.0  0.5  1.0  1.5  2.0  2.5  3.0  3.5  4.0  4.5  5.0

$counts
 [1]    6   42  337 1043  667  236   53    8    3    1    1

$density
 [1] 0.0050062578 0.0350438048 0.2811848144 0.8702544848 0.5565289946 0.1969128077 0.0442219441
 [8] 0.0066750104 0.0025031289 0.0008343763 0.0008343763

$mids
 [1] -0.25  0.25  0.75  1.25  1.75  2.25  2.75  3.25  3.75  4.25  4.75

$xname
[1] ".x[[i]]"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"

df_us_cpt_prev_socdist %>% select(var_diff_socdist) %>% map(hist)
$var_diff_socdist
$breaks
 [1] -1.4 -1.2 -1.0 -0.8 -0.6 -0.4 -0.2  0.0  0.2  0.4  0.6  0.8  1.0  1.2  1.4  1.6  1.8

$counts
 [1]   2   4  10  24  77 171 757 762 163  52  14  22  10   1   2   1

$density
 [1] 0.004826255 0.009652510 0.024131274 0.057915058 0.185810811 0.412644788 1.826737452 1.838803089
 [9] 0.393339768 0.125482625 0.033783784 0.053088803 0.024131274 0.002413127 0.004826255 0.002413127

$mids
 [1] -1.3 -1.1 -0.9 -0.7 -0.5 -0.3 -0.1  0.1  0.3  0.5  0.7  0.9  1.1  1.3  1.5  1.7

$xname
[1] ".x[[i]]"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"

df_us_cpt_prev_socdist %>% dim()
[1] 2397   22
df_us_cpt_prev_socdist %>% drop_na() %>% dim()
[1] 999  22

for(i in head(df_us_socdist_cpt_results,18)){
  plot(i)
}

NA

Predicting change points

Linear models predicting change points (no controls)

lm_cpt_socdist_pers %>% summary()

Call:
lm(formula = cpt_day_socdist ~ pers_o + pers_c + pers_e + pers_a + 
    pers_n, data = df_us_cpt_prev_socdist)

Residuals:
   Min     1Q Median     3Q    Max 
-9.284 -3.119 -1.269  1.149 12.870 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.48853    0.09662 118.902  < 2e-16 ***
pers_o      -0.26056    0.10496  -2.483 0.013112 *  
pers_c       0.15915    0.13093   1.216 0.224291    
pers_e      -0.35704    0.10756  -3.319 0.000915 ***
pers_a       0.61317    0.13367   4.587 4.72e-06 ***
pers_n       0.14504    0.12217   1.187 0.235243    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.731 on 2391 degrees of freedom
Multiple R-squared:  0.02744,   Adjusted R-squared:  0.02541 
F-statistic: 13.49 on 5 and 2391 DF,  p-value: 5.281e-13

Linear models predicting change points with controls

lm_cpt_socdist_pers %>% summary()

Call:
lm(formula = cpt_day_socdist ~ pers_o + pers_c + pers_e + pers_a + 
    pers_n + airport_distance + republican + medage + male + 
    popdens + manufact + tourism + academics + medinc + physician_pc, 
    data = df_us_cpt_prev_socdist)

Residuals:
   Min     1Q Median     3Q    Max 
-9.784 -3.111 -1.220  1.563 12.148 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      11.476908   0.094787 121.080  < 2e-16 ***
pers_o            0.083785   0.121134   0.692  0.48921    
pers_c           -0.008695   0.130595  -0.067  0.94692    
pers_e           -0.256051   0.107138  -2.390  0.01693 *  
pers_a            0.415986   0.137692   3.021  0.00255 ** 
pers_n           -0.314164   0.131217  -2.394  0.01673 *  
airport_distance  0.077067   0.102536   0.752  0.45236    
republican        0.533550   0.115417   4.623 3.99e-06 ***
medage            0.156947   0.100341   1.564  0.11792    
male              0.017010   0.104308   0.163  0.87047    
popdens           0.158159   0.101480   1.559  0.11924    
manufact          0.077820   0.114000   0.683  0.49490    
tourism          -0.280650   0.114817  -2.444  0.01458 *  
academics         0.066161   0.189260   0.350  0.72669    
medinc           -0.873767   0.156588  -5.580 2.68e-08 ***
physician_pc      0.139214   0.104699   1.330  0.18376    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.636 on 2376 degrees of freedom
  (5 observations deleted due to missingness)
Multiple R-squared:  0.07064,   Adjusted R-squared:  0.06477 
F-statistic: 12.04 on 15 and 2376 DF,  p-value: < 2.2e-16
---
title: "COVID-19 US"
author: "Heinrich Peters"
date: "4/15/2020"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

# MAC
 knitr::opts_knit$set(root.dir = '/Users/hp2500/Google Drive/STUDY/Columbia/Research/Corona/Data/US')

library(lmerTest)
library(nlme)
library(psych)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(party)
library(doParallel)

```


# Prepare county level data 

### Overview of time windows
US prevalence: 03/09 - 04/18
US socdist: 03/01 - 05/03

UK prevalence: 03/09 - 04/10
UK socdist: 03/01 - 03/31

GER prevalence: 01/01 - 04/25
GER socdist: 02/25 - 04/27

### Read and format prevalence data 
```{r, warning=FALSE, message=FALSE}

df_us_covid <- read_csv('timeseries_usa_county_march1_april_09.csv')
df_us_covid$time %>% max()

df_us_covid <- df_us_covid %>% 
  filter(time <=31) %>% 
  arrange(countyfips) %>%
  mutate(stabil = -stabil) %>%
  dplyr::rename(county_fips = countyfips,
         pers_o = open, 
         pers_c = sci,
         pers_e = extra,
         pers_a = agree,
         pers_n = stabil)


df_us_covid <- df_us_covid %>% 
  dplyr::select(county_fips, time, mark, rate_day, pers_o,
                pers_c, pers_e, pers_a, pers_n)


df_us_covid %>% head()
```

### Conty level controls 
```{r}

df_us_ctrl <- read.csv('controls_US.csv')

df_us_ctrl <- df_us_ctrl %>% select(-county_name) %>% 
  rename(county_fips = county)

df_us_ctrl %>% head()

```


### Social distancing data unacast
```{r, warning=FALSE, message=FALSE}

df_us_socdist <- read_csv('0409_sds-full-county.csv')

# create sequence of dates
date_sequence <- seq.Date(as.Date('2020-03-09'),
                          as.Date('2020-03-31'), 1)
                     

# create data frame with time sequence
df_dates = tibble(date_sequence, 1:length(date_sequence)) 
names(df_dates) <- c('date', 'time')

# merge day index with gps data
df_us_socdist = df_us_socdist %>% 
  merge(df_dates, by='date') %>% 
  arrange(county_fips) %>%
  as_tibble()

df_us_socdist %>% head()
```


### Social distancing data FB
```{r, warning=FALSE, message=FALSE}

fb_files <- list.files('../FB Data/US individual files/Mobility/',
                       '*.csv', full.names = T)

df_us_socdist_fb <- fb_files %>% 
  map(read_csv) %>% bind_rows()

df_us_socdist_fb$ds %>% summary()

df_us_socdist_fb <- df_us_socdist_fb %>%
  select(-age_bracket, -gender, -baseline_name, -baseline_type) %>%
  rename(date = ds,
         county_fips = polygon_id,
         county_name = polygon_name,
         socdist_tiles = all_day_bing_tiles_visited_relative_change,
         socdist_single_tile = all_day_ratio_single_tile_users)

df_us_socdist_fb <- df_us_socdist_fb %>%
  filter(date >= '2020-03-09' & date <= '2020-03-31') %>%
  group_by(county_fips) %>% 
  arrange(date) %>% 
  mutate(time = row_number()) %>%
  ungroup() %>% 
  arrange(county_fips)

head(df_us_socdist_fb)
```

### Sanity check socdist data
```{r}
socdist <- df_us_socdist %>% merge(df_us_socdist_fb, by = c("county_fips", "time")) 

socdist[c('daily_distance_diff', 'daily_visitation_diff', 'socdist_tiles', 'socdist_single_tile')] %>% 
  cor(use = 'pairwise.complete')

```


### Merge data
```{r}

df_us <- plyr::join_all(list(df_us_covid, df_us_socdist_fb),
                  by = c('county_fips', 'time'), 
                  type = 'inner') %>% 
  plyr::join(df_us_ctrl, by='county_fips') %>% 
  arrange(county_fips, time)

# keep only counties with full data
fips_complete <- df_us %>% 
  group_by(county_fips) %>% 
  summarize(n = n()) %>% 
  filter(n==max(.$n)) %>% 
  .$county_fips

df_us <- df_us %>% filter(county_fips %in% fips_complete)

```

## Explore data
### Plot distributions
```{r, warning=FALSE}

# distribution of observations per county
df_us %>% group_by(county_fips) %>% 
  summarise(mark = mean(mark)) %>% 
  ggplot(aes(x=mark)) + 
  geom_histogram(color="black", fill="white", binwidth = 300) +
  ggtitle('Distribution of observations per county')

  
# distributions of mean prevalence rates per county
df_us %>% group_by(county_fips) %>% 
  summarise(rate_day = mean(rate_day)) %>%
  ggplot(aes(x=rate_day)) + 
  geom_histogram(color="black", fill="white", binwidth = 0.01) +
  ggtitle('Distribution of mean prevalence rates by county')

  
# distribution of mean sd distance measue
df_us %>% group_by(county_fips) %>% 
  summarise(socdist_tiles = mean(socdist_tiles)) %>%
  ggplot(aes(x=socdist_tiles)) + 
  geom_histogram(color="black", fill="white", bins = 200) +
 ggtitle('Distribution of mean tiles visited measure by county')


# distribution of mean sd visit measue
df_us %>% group_by(county_fips) %>% 
  summarise(socdist_single_tile = mean(socdist_single_tile)) %>%
  ggplot(aes(x=socdist_single_tile)) + 
  geom_histogram(color="black", fill="white", bins = 200) +
  ggtitle('Distribution of mean single tile measute by county')


```

### Plot prevalence over time
```{r}

df_us %>% sample_n(20000) %>%
  ggplot(aes(x=time, y=rate_day)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  theme(legend.position="none") +
  ggtitle("Overall prevalence over time")



pers <- c('pers_o', 'pers_c', 'pers_e', 'pers_a', 'pers_n')

for (i in pers){

gg <- df_us %>% mutate(prev_tail = cut(.[[i]], 
                                       breaks = c(-Inf, quantile(.[[i]], 0.05), quantile(.[[i]], 0.95), Inf),
                                       labels = c('lower tail', 'center', 'upper tail'))) %>% 
  filter(prev_tail != 'center') %>%
  ggplot(aes(x=time, y=rate_day)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  facet_wrap(~prev_tail) + 
  theme(legend.position="none") +
  ggtitle(i)

print(gg)
}

```

### Plot social distancing single tile visited
```{r}

df_us %>% sample_n(10000) %>%
  ggplot(aes(x=time, y=socdist_single_tile)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  theme(legend.position="none") +
  ggtitle("Overall social distancing (single tile) over time")

pers <- c('pers_o', 'pers_c', 'pers_e', 'pers_a', 'pers_n')

for (i in pers){

gg <- df_us %>% mutate(dist_tail = cut(.[[i]], 
                                       breaks = c(-Inf, quantile(.[[i]], 0.05), quantile(.[[i]], 0.95), Inf),
                                       labels = c('lower tail', 'center', 'upper tail'))) %>% 
  filter(dist_tail != 'center') %>%
  ggplot(aes(x=time, y=socdist_single_tile)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  facet_wrap(~dist_tail) + 
  theme(legend.position="none") +
  ggtitle(i)

print(gg)
}

```

### Control for weekend effect 
```{r}

weekend <- c(6, 7, 13, 14, 20, 21)

df_us_loess <- df_us %>% filter(!time %in% weekend) %>% 
  split(.$county_fips) %>%
  map(~ loess(socdist_single_tile ~ time, data = .)) %>%
  map(predict, 1:23) %>% 
  bind_rows() %>% 
  gather(key = 'county_fips', value = 'loess') %>% 
  group_by(county_fips) %>% 
  mutate(time = row_number())

df_us <- df_us %>% merge(df_us_loess, by=c('county_fips', 'time')) %>% 
  mutate(socdist_single_tile_clean = ifelse(time %in% weekend, loess,
                                            socdist_single_tile)) %>%
  arrange(county_fips, time)

df_us

```

```{r}

df_us %>% sample_n(10000) %>%
  ggplot(aes(x=time, y=socdist_single_tile_clean)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  theme(legend.position="none") +
  ggtitle("Overall social distancing (single tile) over time")

pers <- c('pers_o', 'pers_c', 'pers_e', 'pers_a', 'pers_n')

for (i in pers){

gg <- df_us %>% mutate(dist_tail = cut(.[[i]], 
                                       breaks = c(-Inf, quantile(.[[i]], 0.05), quantile(.[[i]], 0.95), Inf),
                                       labels = c('lower tail', 'center', 'upper tail'))) %>% 
  filter(dist_tail != 'center') %>%
  ggplot(aes(x=time, y=socdist_single_tile_clean)) + 
  geom_point(aes(col=county_name, size=mark)) + 
  geom_smooth(method="loess", se=T) + 
  facet_wrap(~dist_tail) + 
  theme(legend.position="none") +
  ggtitle(i)

print(gg)
}

```

### Variance over time
```{r}

df_us %>% group_by(time) %>% 
  summarize(socdist_var = var(socdist_single_tile)) %>% 
  ggplot(aes(x=time, y=socdist_var)) +
  geom_line() +
  ggtitle("Variance of social distancing index over time")


df_us %>% group_by(time) %>% 
  summarize(socdist_var = var(socdist_single_tile),
            socdist_var_clean = var(socdist_single_tile_clean)) %>% 
  ggplot() +
  #geom_line(aes(x=time, y=socdist_var)) +
  geom_line(aes(x=time, y=socdist_var_clean)) +
  ggtitle("Variance of smothed social distancing index over time")

```




### Correlations 

```{r}

df_us %>% select(-time, -date, -county_name) %>% 
  group_by(county_fips) %>%
  summarize_if(is.numeric, mean) %>% 
  select(-county_fips) %>%
  cor(use='pairwise.complete.obs') %>% 
  round(3)
  
```


# Model building

## Prepare functions

```{r}

# function calculates all relevant models
run_models <- function(y, lvl1_x, lvl2_x, lvl2_id, data, ctrls=F){

  # subset data
  data = data %>% 
    dplyr::select(all_of(y), all_of(lvl1_x), all_of(lvl2_x), all_of(lvl2_id), 
                  popdens, rate_day, all_of(y))
  data = data %>% 
    dplyr::rename(y = all_of(y),
           lvl1_x = all_of(lvl1_x),
           lvl2_x = all_of(lvl2_x),
           lvl2_id = all_of(lvl2_id)
           )
  
  # configure optimization procedure
  ctrl_config <- lmeControl(opt = 'optim', maxIter = 100, msMaxIter = 100)

  # baseline
  baseline <- lme(fixed = y ~ 1, random = ~ 1 | lvl2_id, 
                    data = data,
                    correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')

  # random intercept fixed slope
  random_intercept <- lme(fixed = y ~ lvl1_x + lvl2_x, 
                          random = ~ 1 | lvl2_id,
                            data = data,
                            correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')

  # random intercept random slope
  random_slope <- lme(fixed = y ~ lvl1_x + lvl2_x, 
                      random = ~ lvl1_x | lvl2_id, 
                        data = data,
                        correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')

  # cross level interaction
  interaction <- lme(fixed = y ~ lvl1_x * lvl2_x, 
                     random = ~ lvl1_x | lvl2_id, 
                       data = data,
                       correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')
  
  # create list with results
  results <- list('baseline' = baseline, 
                  "random_intercept" = random_intercept, 
                  "random_slope" = random_slope,
                  "interaction" = interaction)
  
  
  if (ctrls == 'dem' | ctrls == 'prev'){
    
    # random intercept random slope
    random_slope_ctrl_dem <- lme(fixed = y ~ lvl1_x + lvl2_x + popdens,
                              random = ~ lvl1_x | lvl2_id, 
                          data = data,
                          correlation = corAR1(),
                    control = ctrl_config,
                  method = 'ML')
  
    # cross level interaction
    interaction_ctrl_main_dem <- lme(fixed = y ~ lvl1_x * lvl2_x + popdens,
                             random = ~ lvl1_x | lvl2_id, 
                         data = data,
                         correlation = corAR1(),
                    control = ctrl_config,
                  method = 'ML')
  
    # cross level interaction
    interaction_ctrl_int_dem <- lme(fixed = y ~ lvl1_x * lvl2_x + lvl1_x * popdens,
                             random = ~ lvl1_x | lvl2_id, 
                         data = data,
                         correlation = corAR1(),
                    control = ctrl_config,
                  method = 'ML')        
    
    # create list with results
    results <- list('baseline' = baseline, 
                    "random_intercept" = random_intercept, 
                    "random_slope" = random_slope,
                    "interaction" = interaction,
                    "random_slope_ctrl_dem" = random_slope_ctrl_dem,
                    "interaction_ctrl_main_dem" = interaction_ctrl_main_dem,
                    "interaction_ctrl_int_dem" = interaction_ctrl_int_dem)
  }
  
  if (ctrls == 'prev'){
  
    # random intercept random slope
    random_slope_ctrl_prev <- lme(fixed = y ~ lvl1_x + lvl2_x + popdens + rate_day,
                              random = ~ lvl1_x + rate_day | lvl2_id, 
                          data = data,
                          correlation = corAR1(),
                          control = ctrl_config,
                  method = 'ML')  
    
        # cross level interaction
    interaction_ctrl_main_prev <- lme(fixed = y ~ lvl1_x * lvl2_x + popdens + rate_day,
                             random = ~ lvl1_x | lvl2_id, 
                         data = data,
                         correlation = corAR1(),
                    control = ctrl_config,
                  method = 'ML')
  
  
    # cross level interaction
    interaction_ctrl_int_prev<- lme(fixed = y ~ lvl1_x * lvl2_x + lvl1_x * popdens + rate_day,
                             random = ~ lvl1_x + rate_day | lvl2_id, 
                         data = data,
                         correlation = corAR1(),
                          control = ctrl_config,
                  method = 'ML')
  
    # create list with results
    results <- list('baseline' = baseline, 
                    "random_intercept" = random_intercept, 
                    "random_slope" = random_slope,
                    "interaction" = interaction,
                    "random_slope_ctrl_dem" = random_slope_ctrl_dem,
                    "interaction_ctrl_main_dem" = interaction_ctrl_main_dem,
                    "interaction_ctrl_int_dem" = interaction_ctrl_int_dem,                    
                    "random_slope_ctrl_prev" = random_slope_ctrl_prev,
                    "interaction_ctrl_main_prev" = interaction_ctrl_main_prev,
                    "interaction_ctrl_int_prev" = interaction_ctrl_int_prev)
  }
  
  if(ctrls == 'exp'){
    # random intercept random slope
  random_slope_exp <- lme(fixed = y ~ (lvl1_x + I(lvl1_x^2)) + lvl2_x, 
                      random = ~ (lvl1_x + I(lvl1_x^2)) | lvl2_id, 
                        data = data,
                        correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')

  # cross level interaction
  interaction_exp <- lme(fixed = y ~ (lvl1_x + I(lvl1_x^2)) * lvl2_x, 
                     random = ~ (lvl1_x + I(lvl1_x^2)) | lvl2_id, 
                       data = data,
                       correlation = corAR1(),
                  control = ctrl_config,
                  method = 'ML')  
  
  
  # create list with results
  results <- list('baseline' = baseline, 
                  "random_intercept" = random_intercept, 
                  "random_slope" = random_slope,
                  "interaction" = interaction,                  
                  "random_slope_exp" = random_slope_exp,
                  "interaction_exp" = interaction_exp)
  }
  
  return(results)
        
}

# extracts table with coefficients and tests statistics
extract_results <- function(models) {
  
  models_summary <- models %>% 
  map(summary) %>% 
  map("tTable") %>% 
  map(as.data.frame) %>% 
  map(round, 10) 
  # %>% map(~ .[str_detect(rownames(.), 'Inter|lvl'),])
  
  return(models_summary)
  
}


compare_models <- function(models) {

  mdl_names <- models %>% names()
  
  str = ''
  for (i in mdl_names){
    
    mdl_str <- paste('models$', i, sep = '')
    
    if(i == 'baseline'){
      str <- mdl_str
    }else{
    str <- paste(str, mdl_str, sep=', ')
    }
  }
  
  anova_str <- paste0('anova(', str, ')')
  mdl_comp <- eval(parse(text=anova_str))
  rownames(mdl_comp) = mdl_names
  return(mdl_comp)
}

```

## Rescale Data
```{r}

lvl2_scaled <- df_us %>% 
  select(-time, -mark, -date, -county_name, -rate_day,
         -socdist_tiles, -socdist_single_tile, -socdist_single_tile_clean, -loess) %>% 
  distinct() %>% 
  mutate_at(vars(-county_fips), scale)

lvl1_scaled <- df_us %>% 
  select(county_fips, time, rate_day, socdist_single_tile, socdist_single_tile_clean) %>% 
  mutate_at(vars(-county_fips, -time), scale)

df_us_scaled <- plyr::join(lvl1_scaled, lvl2_scaled, by = 'county_fips')

df_us_scaled
```


## Predict prevalence
### prevalence ~ openness
```{r}

models_o_covid <-run_models(y = 'rate_day',
                         lvl1_x = 'time',
                         lvl2_x = 'pers_o',
                         lvl2_id = 'county_fips',
                         data = df_us_scaled,
                         ctrls = 'dem')

extract_results(models_o_covid)

compare_models(models_o_covid)

```

### prevalence ~ conscientiousness
```{r}

models_c_covid <-run_models(y = 'rate_day', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_c', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'dem')

extract_results(models_c_covid)

compare_models(models_c_covid)


```

### prevalence ~ extraversion
```{r}

models_e_covid <-run_models(y = 'rate_day', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_e', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'dem')

extract_results(models_e_covid)

compare_models(models_e_covid)


```

### prevalence ~ agreeableness
```{r}

models_a_covid <-run_models(y = 'rate_day', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_a', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'dem')

extract_results(models_a_covid)

compare_models(models_a_covid)


```

### prevalence ~ neuroticism
```{r}

models_n_covid <-run_models(y = 'rate_day', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_n', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'dem')

extract_results(models_n_covid)

compare_models(models_n_covid)


```


## Predict social distancing
### social distancing ~ openness
```{r}

models_o_sd <-run_models(y = 'socdist_single_tile', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_o', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'prev')

extract_results(models_o_sd)

compare_models(models_o_sd)


```

### social distancing ~ conscientiousness
```{r}

models_c_sd <-run_models(y = 'socdist_single_tile', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_c', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'prev')

extract_results(models_c_sd)

compare_models(models_c_sd)


```

### social distancing ~ extraversion
```{r}

models_e_sd <-run_models(y = 'socdist_single_tile', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_e', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'prev')

extract_results(models_e_sd)

compare_models(models_e_sd)


```

### social distancing ~ agreeableness
```{r}

models_a_sd <-run_models(y = 'socdist_single_tile', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_a', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'prev')

extract_results(models_a_sd)

compare_models(models_a_sd)


```

### social distancing ~ neuroticism
```{r}

models_n_sd <-run_models(y = 'socdist_single_tile', 
                         lvl1_x = 'time', 
                         lvl2_x = 'pers_n', 
                         lvl2_id = 'county_fips', 
                         data = df_us_scaled,
                         ctrls = 'prev')

extract_results(models_n_sd)

compare_models(models_n_sd)

```


## Create overview table 

### Define function to create overview tables
```{r}

summary_table <- function(models, dv_name, prev=F){

  temp_df_ctrl_main <- NULL
  temp_df_ctrl_int <- NULL
  temp_df_ctrl_int_prev <- NULL
  
  for (i in models){
    results <- i %>% extract_results()
    
    results_ctrl_main <- results$interaction_ctrl_main_dem['lvl1_x:lvl2_x',]
    temp_df_ctrl_main <- temp_df_ctrl_main %>% rbind(results_ctrl_main)
    
    results_ctrl_int <- results$interaction_ctrl_int_dem['lvl1_x:lvl2_x',]
    temp_df_ctrl_int <- temp_df_ctrl_int %>% rbind(results_ctrl_int)
    
    if(prev){
      results_ctrl_int_prev <- results$interaction_ctrl_int_prev['lvl1_x:lvl2_x',]
      temp_df_ctrl_int_prev <- temp_df_ctrl_int_prev %>% rbind(results_ctrl_int_prev)
    }
        
  }
  
  names_ctrl_main <- paste0(dv_name, '~', c('o', 'c', 'e', 'a', 'n'), '*time', '_crtl_popdens')
  rownames(temp_df_ctrl_main) <- names_ctrl_main

  names_ctrl_int <- paste0(dv_name, '~', c('o', 'c', 'e', 'a', 'n'), '*time', '_crtl_popdens*time')
  rownames(temp_df_ctrl_int) <- names_ctrl_int

  if(prev){
    names_ctrl_int_prev <- paste0(dv_name, '~', c('o', 'c', 'e', 'a', 'n'), '*time', '_crtl_popdens*time_prev')
    rownames(temp_df_ctrl_int_prev) <- names_ctrl_int_prev
    
    sum_tab <- rbind(temp_df_ctrl_main, temp_df_ctrl_int, temp_df_ctrl_int_prev) %>% round(4)
  }else{
    sum_tab <- rbind(temp_df_ctrl_main, temp_df_ctrl_int) %>% round(4)
  }


  
  return(sum_tab)

} 

```

### Create overview tables
```{r}
# prevalence
models_prev <- list(models_o_covid, 
                    models_c_covid, 
                    models_e_covid, 
                    models_a_covid, 
                    models_n_covid)

sum_tab_prev <- summary_table(models_prev, dv_name = 'prev')

write.table(sum_tab_prev, quote=F)

# social distancing
models_socdist <- list(models_o_sd, 
                       models_c_sd, 
                       models_e_sd, 
                       models_a_sd, 
                       models_n_sd)

sum_tab_socdist <- summary_table(models_socdist, dv_name = 'socdist', prev=T)

write.table(sum_tab_socdist, quote=F)


```




# Conditional random forest analysis 

### Extract slopes
```{r}

# slope prevalence
df_us_slope_prev <- df_us_scaled %>% split(.$county) %>% 
  map(~ lm(rate_day ~ time, data = .)) %>%
  map(coef) %>% 
  map_dbl('time') %>% 
  as.data.frame() %>% 
  rownames_to_column('county_fips') %>% 
  rename(slope_prev = '.')

df_us_slope_prev <- df_us_scaled %>% select(-time, -rate_day, -socdist_single_tile) %>%
  distinct() %>% 
  mutate(county_fips = as.character(county_fips)) %>%
  inner_join(df_us_slope_prev, by = 'county_fips') %>%
  drop_na()



# slope social distancing
df_us_slope_socdist <- df_us_scaled %>% split(.$county) %>% 
  map(~ lm(socdist_single_tile ~ time, data = .)) %>%
  map(coef) %>% 
  map_dbl('time') %>% 
  as.data.frame() %>% 
  rownames_to_column('county_fips') %>% 
  rename(slope_socdist = '.')

df_us_slope_socdist <- df_us_scaled %>% select(-time, -rate_day, -socdist_single_tile) %>%
  distinct() %>% 
  mutate(county_fips = as.character(county_fips)) %>%
  inner_join(df_us_slope_socdist, by = 'county_fips') %>%
  drop_na()

df_us_slope_socdist
```

### Explore distribution of slopes
```{r}
df_us_slope_prev %>% ggplot(aes(slope_prev)) + geom_histogram(bins = 100)

df_us_slope_socdist %>% ggplot(aes(slope_socdist)) + geom_histogram(bins = 100)

```

# CRF prevalence ~ openness
```{r}

ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_o_fit_prev <- cforest(slope_prev ~ pers_o + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_prev[-1], 
                         controls = ctrls)

crf_o_varimp_prev <- varimp(crf_o_fit_prev, nperm = 1)
crf_o_varimp_cond_prev <- varimp(crf_o_fit_prev, conditional = T, nperm = 1)

crf_o_varimp_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

crf_o_varimp_cond_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') +
  theme(axis.text.x = element_text(angle = 90))

```

# CRF prevalence ~ conscientiousness
```{r}

ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_c_fit_prev <- cforest(slope_prev ~ pers_c + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_prev[-1], 
                         controls = ctrls)

crf_c_varimp_prev <- varimp(crf_c_fit_prev, nperm = 1)
crf_c_varimp_cond_prev <- varimp(crf_c_fit_prev, conditional = T, nperm = 1)

crf_c_varimp_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

crf_c_varimp_cond_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

```


# CRF prevalence ~ extraversion
```{r}

ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_e_fit_prev <- cforest(slope_prev ~ pers_e + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_prev[-1], 
                         controls = ctrls)

crf_e_varimp_prev <- varimp(crf_e_fit_prev, nperm = 1)
crf_e_varimp_cond_prev <- varimp(crf_e_fit_prev, conditional = T, nperm = 1)

crf_e_varimp_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

crf_e_varimp_cond_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

```


# CRF prevalence ~ agreeableness
```{r}

ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_a_fit_prev <- cforest(slope_prev ~ pers_a + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_prev[-1], 
                         controls = ctrls)

crf_a_varimp_prev <- varimp(crf_a_fit_prev, nperm = 1)
crf_a_varimp_cond_prev <- varimp(crf_a_fit_prev, conditional = T, nperm = 1)

crf_a_varimp_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

crf_a_varimp_cond_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

```


# CRF prevalence ~ neuroticism
```{r}

ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_n_fit_prev <- cforest(slope_prev ~ pers_n + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_prev[-1], 
                         controls = ctrls)

crf_n_varimp_prev <- varimp(crf_n_fit_prev, nperm = 1)
crf_n_varimp_cond_prev <- varimp(crf_n_fit_prev, conditional = T, nperm = 1)

crf_n_varimp_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

crf_n_varimp_cond_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

```


# CRF social distancing ~ openness
```{r}

ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_o_fit_socdist <- cforest(slope_socdist ~ pers_o + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_socdist[-1], 
                         controls = ctrls)

crf_o_varimp_socdist <- varimp(crf_o_fit_socdist, nperm = 1)
crf_o_varimp_cond_socdist <- varimp(crf_o_fit_socdist, conditional = T, nperm = 1)

crf_o_varimp_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

crf_o_varimp_cond_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

```

# CRF social distancing ~ conscientiousness
```{r}

ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_c_fit_socdist <- cforest(slope_socdist ~ pers_c + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_socdist[-1], 
                         controls = ctrls)

crf_c_varimp_socdist <- varimp(crf_c_fit_socdist, nperm = 1)
crf_c_varimp_cond_socdist <- varimp(crf_c_fit_socdist, conditional = T, nperm = 1)

crf_c_varimp_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

crf_c_varimp_cond_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

```

# CRF social distancing ~ extraversion
```{r}

ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_e_fit_socdist <- cforest(slope_socdist ~ pers_e + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_socdist[-1], 
                         controls = ctrls)

crf_e_varimp_socdist <- varimp(crf_e_fit_socdist, nperm = 1)
crf_e_varimp_cond_socdist <- varimp(crf_e_fit_socdist, conditional = T, nperm = 1)

crf_e_varimp_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

crf_e_varimp_cond_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

```

# CRF social distancing ~ agreeableness
```{r}

ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_a_fit_socdist <- cforest(slope_socdist ~ pers_a + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_socdist[-1], 
                         controls = ctrls)

crf_a_varimp_socdist <- varimp(crf_a_fit_socdist, nperm = 1)
crf_a_varimp_cond_socdist <- varimp(crf_a_fit_socdist, conditional = T, nperm = 1)

crf_a_varimp_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

crf_a_varimp_cond_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

```


# CRF social distancing ~ neuroticism
```{r}

ctrls <- cforest_unbiased(ntree=500, mtry=5)

crf_n_fit_socdist <- cforest(slope_socdist ~ pers_n + airport_distance + republican +
                          medage + male + popdens + manufact +
                          tourism + academics + medinc + physician_pc, 
                         df_us_slope_socdist[-1], 
                         controls = ctrls)

crf_n_varimp_socdist <- varimp(crf_n_fit_socdist, nperm = 1)
crf_n_varimp_cond_socdist <- varimp(crf_n_fit_socdist, conditional = T, nperm = 1)

crf_n_varimp_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

crf_n_varimp_cond_socdist %>% as.data.frame() %>% rownames_to_column('variable') %>%
  ggplot(aes(x=variable, y=.)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 90))

```

# Linear models predicting slopes from personality
```{r}

lm_slope_prev_pers <- lm(slope_prev ~ pers_o + pers_c + pers_e + pers_a + pers_n, 
                         data = df_us_slope_prev)
lm_slope_prev_pers %>% summary()


lm_slope_socdist_pers <- lm(slope_socdist ~ pers_o + pers_c + pers_e + pers_a + pers_n, 
                            data = df_us_slope_socdist)
lm_slope_socdist_pers %>% summary()

```

# Linear models predicting slopes with controls
```{r}

lm_slope_prev_pers <- lm(slope_prev ~ pers_o + pers_c + pers_e + pers_a + pers_n + 
                           airport_distance + republican + medage + male + popdens + 
                           manufact + tourism + academics + medinc + physician_pc,
                         data = df_us_slope_prev)
lm_slope_prev_pers %>% summary()


lm_slope_socdist_pers <- lm(slope_socdist ~ pers_o + pers_c + pers_e + pers_a + pers_n + 
                              airport_distance + republican + medage + male + popdens + 
                              manufact + tourism + academics + medinc + physician_pc,
                            data = df_us_slope_socdist)
lm_slope_socdist_pers %>% summary()

```


```{r}
library(tigris)
library(ggplot2)
library(ggthemes)

me <- counties(cb = TRUE)
me_map <- fortify(me)
plot(me_map)

me_map %>% head()
me %>% head()

gg <- ggplot()
gg <- gg + geom_map(data=us_map_shape, map=us_map_shape,
                    aes( x = x, y = y, map_id=fips),
                    color="black", fill="white", size=0.25)
gg <- gg + coord_map()
gg <- gg + theme_map()
gg
```

```{r}
df_us_covid %>% filter(county == 23013)

```


```{r}

library(usmap)
library(ggplot2)

us_map_shape = us_map(regions = 'counties')

us_map_shape

plot_usmap(data = us_map_shape) + 
  labs(title = "US Counties",
       subtitle = "This is a blank map of the counties of the United States.") + 
  theme(panel.background = element_rect(color = "black", fill = "lightblue"))
```

# Change point analysis

```{r}
library(changepoint)
```

### Preparation
```{r}
# keep only counties with full data
fips_complete <- df_us_scaled %>% 
  group_by(county_fips) %>% 
  summarize(n = n()) %>% 
  filter(n==max(.$n)) %>% 
  .$county_fips
```


```{r}
df_us_scaled
```


### Prevalence
```{r}

# run changepoint analysis
df_us_prev_cpt_results <- df_us_scaled %>% select(county_fips, rate_day) %>%
  filter(county_fips %in% fips_complete) %>% 
  split(.$county_fips) %>%
  map(~ cpt.meanvar(as.vector(.$rate_day),
                    class=TRUE,
                    param.estimates=TRUE,
                    Q=1))

# calculate change point
df_us_prev_cpt_day <- df_us_prev_cpt_results %>% 
  map(cpts) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(cpt_day_prev = '.') %>%
  rownames_to_column('county_fips')

# calculate mean differences
df_us_prev_cpt_mean_diff <- df_us_prev_cpt_results %>% 
  map(param.est) %>% 
  map(~ .$mean) %>% 
  map(~ .[2]-.[1]) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(mean_diff_prev = '.') %>%
  rownames_to_column('county_fips')

# calculate varaince differences
df_us_prev_cpt_var_diff <- df_us_prev_cpt_results %>% 
  map(param.est) %>% 
  map(~ .$variance) %>% 
  map(~ .[2]-.[1]) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(var_diff_prev = '.') %>%
  rownames_to_column('county_fips')

# merge new variables 
df_us_cpt_prev <- df_us_scaled %>%
  select(-time, -rate_day, -socdist_single_tile, -socdist_single_tile_clean) %>%
  distinct() %>%
  mutate(county_fips = as.character(county_fips)) %>%
  left_join(df_us_prev_cpt_day, by='county_fips') %>%
  left_join(df_us_prev_cpt_mean_diff, by='county_fips') %>%
  left_join(df_us_prev_cpt_var_diff, by='county_fips')

df_us_cpt_prev %>% select(cpt_day_prev) %>% map(hist)
df_us_cpt_prev %>% select(mean_diff_prev) %>% map(hist)
df_us_cpt_prev %>% select(var_diff_prev) %>% map(hist)

df_us_cpt_prev %>% dim()
df_us_cpt_prev %>% drop_na() %>% dim()

```


```{r}

for(i in head(df_us_prev_cpt_results,18)){
  plot(i)
}

```

### Socdist
```{r}

# run changepoint analysis
df_us_socdist_cpt_results <- df_us_scaled %>% select(county_fips, socdist_single_tile_clean) %>%
  filter(county_fips %in% fips_complete) %>% 
  split(.$county_fips) %>%
  map(~ cpt.meanvar(as.vector(.$socdist_single_tile_clean),
                    #penalty = 'Asymptotic',
                    class=TRUE,
                    param.estimates=TRUE,
                    Q=1,
                    test.stat = 'Normal'))

# calculate change point
df_us_socdist_cpt_day <- df_us_socdist_cpt_results %>% 
  map(cpts) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(cpt_day_socdist = '.') %>%
  rownames_to_column('county_fips')

# calculate mean differences
df_us_socdist_cpt_mean_diff <- df_us_socdist_cpt_results %>% 
  map(param.est) %>% 
  map(~ .$mean) %>% 
  map(~ .[2]-.[1]) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(mean_diff_socdist = '.') %>%
  rownames_to_column('county_fips')

# calculate varaince differences
df_us_socdist_cpt_var_diff <- df_us_socdist_cpt_results %>% 
  map(param.est) %>% 
  map(~ .$variance) %>% 
  map(~ .[2]-.[1]) %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rename(var_diff_socdist = '.') %>%
  rownames_to_column('county_fips')

# merge new variables 
df_us_cpt_prev_socdist <- df_us_cpt_prev %>%
  left_join(df_us_socdist_cpt_day, by='county_fips') %>%
  left_join(df_us_socdist_cpt_mean_diff, by='county_fips') %>%
  left_join(df_us_socdist_cpt_var_diff, by='county_fips')

df_us_cpt_prev_socdist %>% select(cpt_day_socdist) %>% map(hist)
df_us_cpt_prev_socdist %>% select(mean_diff_socdist) %>% map(hist)
df_us_cpt_prev_socdist %>% select(var_diff_socdist) %>% map(hist)

df_us_cpt_prev_socdist %>% dim()
df_us_cpt_prev_socdist %>% drop_na() %>% dim()

```

```{r}

for(i in head(df_us_socdist_cpt_results,18)){
  plot(i)
}

```
```{r}
df_us_cpt_prev_socdist
```

# Predicting change points 
### Linear models predicting change points (no controls)
```{r}

lm_cpr_prev_pers <- lm(cpt_day_prev ~ pers_o + pers_c + pers_e + pers_a + pers_n, 
                         data = df_us_cpt_prev_socdist)
lm_cpr_prev_pers %>% summary()


lm_cpt_socdist_pers <- lm(cpt_day_socdist ~ pers_o + pers_c + pers_e + pers_a + pers_n, 
                            data = df_us_cpt_prev_socdist)
lm_cpt_socdist_pers %>% summary()

```

# Linear models predicting change points with controls
```{r}

lm_cpt_prev_pers <- lm(cpt_day_prev ~ pers_o + pers_c + pers_e + pers_a + pers_n + 
                           airport_distance + republican + medage + male + popdens + 
                           manufact + tourism + academics + medinc + physician_pc,
                         data = df_us_cpt_prev_socdist)
lm_cpt_prev_pers %>% summary()

lm_cpt_socdist_pers <- lm(cpt_day_socdist ~ pers_o + pers_c + pers_e + pers_a + pers_n + 
                              airport_distance + republican + medage + male + popdens + 
                              manufact + tourism + academics + medinc + physician_pc,
                            data = df_us_cpt_prev_socdist)
lm_cpt_socdist_pers %>% summary()

```


